Compute TF-IDF matrices from the word co-occurrence matrices of the train and test sets, then use them to compute the cosine similarity (linear kernel) between train and test entries and predict tags from the most similar entries. Finally, replace the predictions for duplicate questions with the known tags from the train set and save the result in the submission file.
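As a minimal sketch of the pipeline (toy count matrices with hypothetical values, not the real data), TF-IDF with l2 normalization followed by a linear kernel yields cosine similarities:

from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import linear_kernel

# toy co-occurrence counts: rows = entries, columns = vocabulary words
train_counts = csr_matrix([[2, 0, 1], [0, 3, 0], [1, 1, 0]])
test_counts = csr_matrix([[1, 0, 1]])

tfidf = TfidfTransformer(norm="l2")
train_tfidf = tfidf.fit_transform(train_counts)  # learn idf weights on the train counts
test_tfidf = tfidf.transform(test_counts)        # reuse the train idf weights
# on l2-normalized rows the linear kernel equals cosine similarity
sims = linear_kernel(test_tfidf, train_tfidf)    # shape (1, 3): one row per test entry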
In [1]:
import pandas as pd
import numpy as np
import string
import time
from scipy.sparse import *
from scipy.io import mmwrite, mmread
import csv
from bs4 import BeautifulSoup
from nltk.tag import brill
from taggerfunctions import *
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from ast import literal_eval
import sklearn as sk
import gc
import os
import psutil
In [2]:
def getRealDict(fname):
    """Read a CSV of (int key, value) rows into a plain dict."""
    with open(fname, 'r') as f:
        reader = csv.reader(f)
        return {literal_eval(rows[0]): rows[1] for rows in reader}
def getDict(fname):
    """Read a CSV of (key, literal value) rows into a pandas Series."""
    with open(fname, 'r') as f:
        reader = csv.reader(f)
        dictWords = {rows[0]: literal_eval(rows[1]) for rows in reader}
    return pd.Series(dictWords)
def getInvDict(fname):
    """Read a CSV of (int key, value) rows into a Series indexed by the int keys."""
    with open(fname, 'r') as f:
        reader = csv.reader(f)
        invDictWords = {literal_eval(rows[0]): rows[1] for rows in reader}
    return pd.Series(invDictWords)
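These helpers assume two-column CSVs written from dicts; for example, getInvDict expects rows like the following (hypothetical values), with an integer index in the first column and the word/tag string in the second:

0,python
1,pandas
2,numpy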
In [3]:
def save_results(predictions, filename):
    """Given a mapping of question Id to predicted tags, save results in the submission CSV format."""
    with open(filename, 'w') as f:
        f.write("Id,Tags\n")
        for i, pred in predictions.iteritems():
            f.write(str(i) + ",\"" + pred + "\"\n")
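For reference, the file written by save_results starts like this (hypothetical ids and tags):

Id,Tags
1,"c# winforms"
2,"python pandas"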
In [8]:
from sklearn.feature_extraction.text import TfidfTransformer
In [6]:
coocMatTitle_coo = mmread("coocMatTitleNew_coo.mtx")
coocMatTitle_csr = coocMatTitle_coo.tocsr()
In [4]:
coocMatBodyFull_csr = mmread("coocMatBodyFull2_csr.mtx")
coocMatBodyFull_csr = coocMatBodyFull_csr.tocsr()
In [9]:
tfidf = TfidfTransformer(norm="l2")
tf_idf_matrix_title = tfidf.fit_transform(coocMatTitle_csr)
tf_idf_matrix_body = tfidf.fit_transform(coocMatBodyFull_csr)
In [10]:
mmwrite("tfidfMatTitle.mtx",tf_idf_matrix_title)
mmwrite("tfidfMatBody.mtx",tf_idf_matrix_body)
In [11]:
testWTitle = mmread("testWordsQTitle_0-200000.mtx")
In [12]:
testWTitle
In [13]:
tf_idf_matTestTitle = tfidf.fit_transform(testWTitle)
In [15]:
tf_idf_matTestTitle
In [4]:
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(norm="l2")
In [5]:
tf_idf_title = mmread("tfidfMatTitle.mtx")
tf_idf_title = tf_idf_title.tocsr()
In [6]:
tf_idf_body = mmread("tfidfMatBody.mtx")
tf_idf_body = tf_idf_body.tocsr()
In [ ]:
frange = ["0-200000","200000-400000","400000-600000","600000-800000","800000-1000000","1000000-1200000","1200000-1400000",
"1400000-1600000","1600000-1800000","1800000-2000000","2000000-2013337"]
invDictKeys = getInvDict("invdictKeys.csv")
invDictWords = getInvDict("invdictWordsNew.csv")
result_tags = {}
countRows = 0
""" choose a number for chunk_size that is a divisor of the number of rows in testWordsQ;
otherwise the iterator will skip the last entries!"""
chunk_size = 1000
tStart = time.time()
for fran in frange:
fname = "invdictIdTest_" + fran + ".csv"
""" make sure that invDictIdTest is sorted!!! use getInvDict or sort dict after using getDict """
invDictIdTest = getInvDict(fname)
fname = "testWordsQTitle_" + fran + ".mtx"
testQTitle = mmread(fname)
testQTitle = testQTitle.tocsr()
tf_idf_matTestTitle = tfidf.fit_transform(testQTitle)
#fname = "testWordsQBody_" + fran + ".mtx"
#testQBody = mmread(fname)
#testQBody = testQBody.tocsr()
#tf_idf_matTestBody = tfidf.fit_transform(testQBody)
for idx_chunk in xrange(len(invDictIdTest)/chunk_size):
cs_title = linear_kernel(tf_idf_matTestTitle[chunk_size*idx_chunk:chunk_size*(idx_chunk+1),:], tf_idf_title)
#cs_body = linear_kernel(tf_idf_matTestBody[chunk_size*idx_chunk:chunk_size*(idx_chunk+1),:], tf_idf_body)
for row_idx,row in enumerate(cs_title):
rel_tags = row.argsort()[:-20:-1]
ar = []
words = []
for idx in testQTitle[row_idx,:].nonzero()[1]:
words.append(invDictWords[idx])
for idx in rel_tags:
if invDictKeys[idx] in words:
ar.append(invDictKeys[idx])
for idx in rel_tags[0:2]:
if invDictKeys[idx] not in ar:
ar.append(invDictKeys[idx])
result_tags[invDictIdTest[countRows%200000]] = " ".join(ar)
countRows += 1
del cs_title
gc.collect()
if countRows % 20000 == 0:
print("{0:d} questions finished in {1:.0f}s".format(countRows, time.time()-tStart))
tStart = time.time()
""" the linear_kernel below is necessary because matrix size 13337 is not divisible by chunk_size;
it will skip the last part in the for loop above and i have to do it manually below """
cs_title = linear_kernel(tf_idf_matTestTitle[len(invDictIdTest) - len(invDictIdTest)%chunk_size:,:], tf_idf_title)
#cs_body = linear_kernel(tf_idf_matTestBody[len(invDictIdTest) - len(invDictIdTest)%chunk_size:,:], tf_idf_body)
for row_idx,row in enumerate(cs_title):
rel_tags = row.argsort()[:-20:-1]
ar = []
words = []
for idx in testQTitle[row_idx,:].nonzero()[1]:
words.append(invDictWords[idx])
for idx in rel_tags:
if invDictKeys[idx] in words:
ar.append(invDictKeys[idx])
for idx in rel_tags[0:2]:
if invDictKeys[idx] not in ar:
ar.append(invDictKeys[idx])
result_tags[invDictIdTest[countRows%200000]] = " ".join(ar)
countRows += 1
""" save the resulting dict in file """
result_tagsSeries = pd.Series(result_tags)
result_tagsSeries.to_csv("resultTags_title_v3.csv")
In [ ]:
frange = ["0-200000","200000-400000","400000-600000","600000-800000","800000-1000000","1000000-1200000","1200000-1400000",
"1400000-1600000","1600000-1800000","1800000-2000000","2000000-2013337"]
invDictKeys = getInvDict("invdictKeys.csv")
result_tags = {}
countRows = 0
""" choose a number for chunk_size that is a divisor of the number of rows in testWordsQ;
otherwise the iterator will skip the last entries!"""
chunk_size = 500
tStart = time.time()
for fran in frange:
fname = "invdictIdTest_" + fran + ".csv"
""" make sure that invDictIdTest is sorted!!! use getInvDict or sort dict after using getDict """
invDictIdTest = getInvDict(fname)
fname = "testWordsQTitle_" + fran + ".mtx"
testQTitle = mmread(fname)
tf_idf_matTestTitle = tfidf.fit_transform(testQTitle)
fname = "testWordsQBody_" + fran + ".mtx"
testQBody = mmread(fname)
testQBody = testQBody.tocsr()
tf_idf_matTestBody = tfidf.fit_transform(testQBody)
del testQTitle
del testQBody
gc.collect()
for idx_chunk in xrange(len(invDictIdTest)/chunk_size):
cs_title = linear_kernel(tf_idf_matTestTitle[chunk_size*idx_chunk:chunk_size*(idx_chunk+1),:], tf_idf_title)
cs_body = linear_kernel(tf_idf_matTestBody[chunk_size*idx_chunk:chunk_size*(idx_chunk+1),:], tf_idf_body)
for row in cs_title:
rel_tags = (row).argsort()[:-4:-1]
ar = []
for idx in rel_tags:
ar.append(invDictKeys[idx])
result_tags[invDictIdTest[countRows%200000]] = " ".join(ar)
countRows += 1
del cs_title
gc.collect()
if countRows % 20000 == 0:
print("{0:d} questions finished in {1:.0f}s".format(countRows, time.time()-tStart))
tStart = time.time()
""" the linear_kernel below is necessary because matrix size 13337 is not divisible by chunk_size;
it will skip the last part in the for loop above and i have to do it manually below """
cs_title = linear_kernel(tf_idf_matTestTitle[len(invDictIdTest) - len(invDictIdTest)%chunk_size:,:], tf_idf_title)
cs_body = linear_kernel(tf_idf_matTestBody[len(invDictIdTest) - len(invDictIdTest)%chunk_size:,:], tf_idf_body)
for lst in cs_title:
rel_tags = lst.argsort()[:-4:-1]
for key in rel_tags:
l.append(invDictKeys[key])
result_tags[invDictIdTest[countRows%200000]] = " ".join(l)
countRows += 1
""" save the resulting dict in file """
result_tagsSeries = pd.Series(result_tags)
result_tagsSeries.to_csv("resultTags_T+B_v2.csv")
In [37]:
result_tagsSeries_1 = getRealDict("resultTags_T+B_v2.csv")
#result_tagsSeries_1 = pd.Series(result_tagsSeries_1)
In [38]:
result_tagsSeries_2 = getRealDict("resultTags_T+B_v2_2.csv")
#result_tagsSeries_2 = pd.Series(result_tagsSeries_2)
In [39]:
result_tagsSeries_full = dict(list(result_tagsSeries_1.items()) + list(result_tagsSeries_2.items()))
In [40]:
result_tagsSeries_full1 = pd.Series(result_tagsSeries_full)
In [43]:
result_tagsSeries_full1.to_csv("resultTags_T+B_v2_full.csv")
In [9]:
duplicates = pd.read_csv("duplicates_single_v2.csv", index_col='Unnamed: 0')
In [10]:
dup = pd.Series(duplicates['Tags'].values, index=duplicates['Id_x'])
result = getRealDict("resultTags_title_v3.csv")
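duplicates_single_v2.csv is assumed to map each test question that duplicates a training question (Id_x) to the known tags of its training counterpart, e.g. (hypothetical rows, with an unnamed index column as written by pandas):

,Id_x,Tags
0,42,"python list"
1,97,"java swing jtable"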
In [11]:
# overwrite the similarity-based predictions with the known tags for duplicates
for key, value in dup.iteritems():
    result[key] = value
In [12]:
save_results(result, "submission_tfidf_titleonly_v3.csv")